library(tidyverse)
library(stringr)
library(caret)
library(plotly)
library(ggthemes)
library(GGally)
library(class)
library(e1071)
library(stringr)
This analysis is about the number of beers consumed by American people in the United States. In this document we conduct an analysis about
# Cleaning ABV: fill every missing ABV with the mean of the observed ABV values.
df_beers_cl0 <- df_beers
abv_is_missing <- is.na(df_beers_cl0$ABV)
# Mean over the non-missing entries only.
nr_mean_abv <- mean(df_beers_cl0$ABV[!abv_is_missing])
# Number of rows whose ABV is missing.
length_abv <- sum(abv_is_missing)
# Assigning through a logical mask is a no-op when nothing is missing,
# so no explicit guard on length_abv is required.
df_beers_cl0$ABV[abv_is_missing] <- nr_mean_abv
# Cleaning IBU: fill every missing IBU with the mean of the observed IBU values.
# NOTE(review): this copy starts from the raw df_beers, not df_beers_cl0, so the
# ABV fill above is not carried into df_beers_cl1 -- confirm that is intended.
df_beers_cl1 <- df_beers
ibu_is_missing <- is.na(df_beers_cl1$IBU)
# Mean over the non-missing entries only.
nr_mean_ibu <- mean(df_beers_cl1$IBU[!ibu_is_missing])
# Number of rows whose IBU is missing.
length_ibu <- sum(ibu_is_missing)
# Assigning through a logical mask is a no-op when nothing is missing,
# so no explicit guard on length_ibu is required.
df_beers_cl1$IBU[ibu_is_missing] <- nr_mean_ibu
#Cleaning using knnImpute
# preProcValues <- preProcess(df_beers %>%
# select(ABV,IBU),
# method = c("knnImpute"),
# k = 20,
# knnSummary = mean)
# df_beers_unp <- predict(preProcValues, df_beers,na.action = na.pass)
# procNames <- data.frame(col = names(preProcValues$mean), mean = preProcValues$mean, sd = preProcValues$std)
# for(i in procNames$col){
# df_beer_info[i] <- df_beer_info[i]*preProcValues$std[i]+preProcValues$mean[i]
# }
# Fit a k-nearest-neighbour imputation model (k = 20, neighbours summarised by
# their mean) on the ABV and IBU columns of the ABV-filled data.
knn_imp_model <- preProcess(
  select(df_beers_cl0, ABV, IBU),
  method = "knnImpute",
  k = 20,
  knnSummary = mean
)
# Apply the model; rows containing NA are passed through to be imputed.
df_beers_unp <- predict(knn_imp_model, df_beers_cl0, na.action = na.pass)
# Per-column mean and standard deviation stored on the fitted model.
procNames <- data.frame(
  col = names(knn_imp_model$mean),
  mean = knn_imp_model$mean,
  sd = knn_imp_model$std
)
# Map each processed column back to its original scale: value * std + mean.
for (i in procNames$col) {
  df_beers_unp[[i]] <- df_beers_unp[[i]] * knn_imp_model$std[[i]] +
    knn_imp_model$mean[[i]]
}
# Cleansing beers: row count of the raw beers table (2410 records)
nr_rows <- nrow(df_beers)
#Finding the NAN values
#summary(df_beers)
# df_beers %>% ggplot(aes(x=IBU))+geom_histogram(aes(fill="green"))
# df_beers %>% ggplot(aes(x=ABV))+geom_histogram(aes(fill="green"))
#
# df_beers_cl1 %>% ggplot(aes(x=IBU))+geom_histogram(aes(fill="blue"))
# df_beers_cl1 %>% ggplot(aes(x=ABV))+geom_histogram(aes(fill="blue"))
# Distribution of IBU in the imputed data (default histogram binning).
ggplot(df_beers_unp, aes(x = IBU)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Distribution of ABV in the imputed data (default histogram binning).
ggplot(df_beers_unp, aes(x = ABV)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Box plot of ABV to show spread and outliers.
ggplot(df_beers_unp, aes(y = ABV)) +
  geom_boxplot()
# Column-wise summary statistics of the imputed data.
summary(df_beers_unp)
## Name Beer_ID ABV IBU
## Length:2410 Min. : 1.0 Min. :0.00100 Min. : 4.00
## Class :character 1st Qu.: 808.2 1st Qu.:0.05000 1st Qu.: 24.20
## Mode :character Median :1453.5 Median :0.05700 Median : 36.70
## Mean :1431.1 Mean :0.05977 Mean : 42.92
## 3rd Qu.:2075.8 3rd Qu.:0.06700 3rd Qu.: 60.00
## Max. :2692.0 Max. :0.12800 Max. :138.00
## Brewery_id Style Ounces
## Min. : 1.0 Length:2410 Min. : 8.40
## 1st Qu.: 94.0 Class :character 1st Qu.:12.00
## Median :206.0 Mode :character Median :12.00
## Mean :232.7 Mean :13.59
## 3rd Qu.:367.0 3rd Qu.:16.00
## Max. :558.0 Max. :32.00
# Count the breweries in each state (one row per State / Name_State pair).
df_summary <- df_breweries_2 %>%
  group_by(State, Name_State) %>%
  summarize(NumberBreweries = n())
## `summarise()` has grouped output by 'State'. You can override using the
## `.groups` argument.
# Render the per-state brewery counts as a table.
# The caption names breweries because df_summary holds NumberBreweries
# (a row count of the breweries table), not a count of beers.
knitr::kable(
  df_summary,
  caption = "Number of Breweries by State"
)
| State | Name_State | NumberBreweries |
|---|---|---|
| AK | Alaska | 7 |
| AL | Alabama | 3 |
| AR | Arkansas | 2 |
| AZ | Arizona | 11 |
| CA | California | 39 |
| CO | Colorado | 47 |
| CT | Connecticut | 8 |
| DC | District of Columbia | 1 |
| DE | Delaware | 2 |
| FL | Florida | 15 |
| GA | Georgia | 7 |
| HI | Hawaii | 4 |
| IA | Iowa | 5 |
| ID | Idaho | 5 |
| IL | Illinois | 18 |
| IN | Indiana | 22 |
| KS | Kansas | 3 |
| KY | Kentucky | 4 |
| LA | Louisiana | 5 |
| MA | Massachusetts | 23 |
| MD | Maryland | 7 |
| ME | Maine | 9 |
| MI | Michigan | 32 |
| MN | Minnesota | 12 |
| MO | Missouri | 9 |
| MS | Mississippi | 2 |
| MT | Montana | 9 |
| NC | North Carolina | 19 |
| ND | North Dakota | 1 |
| NE | Nebraska | 5 |
| NH | New Hampshire | 3 |
| NJ | New Jersey | 3 |
| NM | New Mexico | 4 |
| NV | Nevada | 2 |
| NY | New York | 16 |
| OH | Ohio | 15 |
| OK | Oklahoma | 6 |
| OR | Oregon | 29 |
| PA | Pennsylvania | 25 |
| RI | Rhode Island | 5 |
| SC | South Carolina | 4 |
| SD | South Dakota | 1 |
| TN | Tennessee | 3 |
| TX | Texas | 28 |
| UT | Utah | 4 |
| VA | Virginia | 16 |
| VT | Vermont | 10 |
| WA | Washington | 23 |
| WI | Wisconsin | 20 |
| WV | West Virginia | 1 |
| WY | Wyoming | 4 |
# Attach brewery information to every beer by matching the beer's Brewery_id
# to the brewery's Brew_ID.
df_beerbre_unp <- merge(
  df_beers_unp,
  df_breweries_2,
  by.x = "Brewery_id",
  by.y = "Brew_ID"
)
# Per-state median ABV and IBU. The columns are named Median_*, so they are
# computed with median() rather than mean() to make the values match the names.
df_acom_bebrew_1 <- df_beerbre_unp %>%
  group_by(State, Name_State) %>%
  summarize(Median_ABV = median(ABV), Median_IBU = median(IBU))
# df_acom_bebrew_1 %>% ggplot(aes(x=State,color=State))+geom_bar()+labs(title = "Alcohol by Volume",subtitle = "Alcohol by Volume average by State")+coord_flip()
# Keep the single beer with the highest ABV (first row after a descending sort).
df_sort_1 <- df_beerbre_unp %>%
  arrange(desc(ABV)) %>%
  head(n = 1)
# Report the state of that beer together with its ABV.
sprintf(
  "The state that has the maximum ABV is %s-%f",
  df_sort_1$Name_State,
  df_sort_1$ABV
)
## [1] "The state that has the maximum ABV is Colorado-0.128000"
1.0.7 Comment on the summary statistics and distribution of the ABV variable
1.0.7.1 According to the histogram, the ABV data appear to be approximately normally distributed
###Activity 8, Difference with respect to IBU and ABV IPA and ALE